import numpy as np
from scipy import linalg
import scipy.io as sio
import random
from statistics import mean
from math import pi, sin, cos
from API.camera import Camera
from API.functions3D import *
from API.linear_solver import *
from API.nonlinear_solver import *
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import open3d as o3d
import pickle
Jupyter environment detected. Enabling Open3D WebVisualizer. [Open3D INFO] WebRTC GUI backend enabled. [Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.

One of the most fundamental problems in multiple view geometry is the problem of triangulation, the process of determining the location of a 3D point given its projections into two or more images.
Triangulation determines :



The projection of a real-world point $P$ to the digital image plane is: $$p = M P$$
Here $M$ is the camera matrix, which can be written as :
$$M = \begin{bmatrix} (f+z_0)k & 0 & c_x \\ 0 & (f+z_0)l & c_y \\ 0 & 0 & 1 \end{bmatrix} \begin{bmatrix} I_3 & 0 \end{bmatrix} \begin{bmatrix} R & T \\ 0 & 1 \end{bmatrix} = K \begin{bmatrix} I_3 & 0 \end{bmatrix} \begin{bmatrix} R & T \\ 0 & 1 \end{bmatrix} $$where :

In this section we show how we get the 2D projections of a 3D point cloud if we know the camera matrices.

# Load the skeleton point cloud archive: the raw cloud, a transformed copy,
# and the bounding-box side lengths ('dist') used later to place the cameras.
array_object=np.load('NPZ/transformed_point_cloud_array_skeleton.npz')
transformed_point_cloud_arr_npz=array_object['transformed_point_cloud_array']
point_cloud_arr_npz=array_object['point_cloud_array']
box_sides=array_object['dist']
# Downsample points in the point cloud
downsampled_dataset_size = 100000
# random_downsampler is a project helper; presumably draws a random subset of
# the given size from the cloud — TODO confirm against API.functions3D.
downsampled_dataset=random_downsampler(point_cloud_arr_npz, downsampled_dataset_size)
# Split the downsampled cloud into per-axis coordinate arrays
# (axis index 0 = x, 1 = y, 2 = z).
x_component=extract_orthogonal_component(downsampled_dataset, 0)
y_component=extract_orthogonal_component(downsampled_dataset, 1)
z_component=extract_orthogonal_component(downsampled_dataset, 2)
Downsampled dataset size: 100000
# Assemble the per-axis coordinate arrays into a single (num_points, 3)
# array of 3D points P, one row per point.
num_points = downsampled_dataset_size
P_x_array = x_component
P_y_array = y_component
P_z_array = z_component
# Vectorized stack replaces the original per-point Python loop; the float cast
# matches the dtype the original np.zeros buffer produced.
P_array = np.column_stack((P_x_array, P_y_array, P_z_array)).astype(float)
# Visualize P in 3D plot
fig = plt.figure(dpi=400)
ax_3d = fig.add_subplot(projection='3d')
# FIX: "." must be the `marker` keyword. Passed positionally it is consumed by
# Axes3D.scatter's `zdir` parameter and the marker style is silently ignored.
ax_3d.scatter(P_array[:, 0], P_array[:, 1], P_array[:, 2], marker=".")
ax_3d.set_xlabel("X_axis")
ax_3d.set_ylabel("Y_axis")
ax_3d.set_zlabel("Z_axis")
Text(0.5, 0, 'Z_axis')
# Number of cameras in the simulated multi-view rig.
num_cameras = 6
# z (Camera Distance along the z-axis, outside the bounding box (Can be a random value))
z = 6*max(box_sides)
# The Cameras are homogeneous, so keep the intrinsic parameters the same
f = 1      # focal length (appears as f + z0 in the intrinsic matrix K above)
z0 = 1     # offset added to the focal length in K
k = 1      # x-axis scale factor in K
l =1       # y-axis scale factor in K
c_x = 0    # principal point x-coordinate
c_y = 0    # principal point y-coordinate
# Per-camera extrinsic rotations: about x (alpha), y (beta), z (gamma).
# Only the yaw (beta) varies — cameras look inward from a ring around the object.
alpha_array = [0, 0, 0, 0, 0, 0]
beta_array = [0, np.pi/4, 3*np.pi/4, np.pi, 5*np.pi/4, 7*np.pi/4]
gamma_array = [0, 0, 0, 0, 0, 0]
# Camera positions on a circle of radius z in the xz-plane, one per beta angle.
x_array = [0, z/np.sqrt(2), z/np.sqrt(2), 0, -z/np.sqrt(2), -z/np.sqrt(2)]
y_array = [0, 0, 0, 0, 0, 0]
z_array = [z, z/np.sqrt(2), -z/np.sqrt(2), -z, -z/np.sqrt(2), z/np.sqrt(2)]
# Visualize P in 3D plot, together with the camera positions and their
# lines of sight.
elev=0
azim=90
roll=0
fig = plt.figure(dpi=400)
ax_3d = fig.add_subplot(projection='3d')
ax_3d.view_init(elev, azim, roll)
# FIX: "." must be the `marker` keyword; as a positional argument it would be
# consumed by Axes3D.scatter's `zdir` parameter and silently ignored.
ax_3d.scatter(P_array[:, 0], P_array[:, 1], P_array[:, 2], marker=".", color="green")
ax_3d.set_xlabel("X_axis")
ax_3d.set_ylabel("Y_axis")
ax_3d.set_zlabel("Z_axis")
ax_3d.grid(True)
ax_3d.set_xticklabels([])
ax_3d.set_yticklabels([])
ax_3d.set_zticklabels([])
# Camera positions (blue).
ax_3d.scatter(x_array, y_array, z_array, marker=".", color="blue")
# Viewing-direction components derived from each camera's yaw angle beta.
u_array = [-cos(beta) for beta in beta_array]
v_array = [-sin(beta) for beta in beta_array]
# Draw each camera's line of sight as a red arrow pointing at the object.
for cam_x, cam_y, cam_z, u, v in zip(x_array, y_array, z_array, u_array, v_array):
    ax_3d.quiver(cam_x, cam_y, cam_z, v, 0, u, length = 5000, normalize = True, color="red")
Let the extrinsic calibration of the camera wrt Ground coordinates be :
Consider the camera matrix for 2D projection from the camera perspective.
# Build one Camera object per view and cache its complete projection matrix.
camera_dict = {}
for idx in range(num_cameras):
    cam = Camera(idx + 1)
    cam.set_intrinsic_matrix_params(f, z0, k, l, c_x, c_y)
    # Ground to Camera Perspective (Have to invert the Rotation and Translation values)
    cam.set_extrinsic_rot_matrix_params(-alpha_array[idx], -beta_array[idx], -gamma_array[idx])
    cam.set_extrinsic_translation_matrix_params(-x_array[idx], -y_array[idx], -z_array[idx])
    projection_matrix = cam.get_camera_matrix(
        cam.get_intrinsic_matrix(),
        cam.get_3D_rotation_matrix(),
        cam.get_3D_translation_matrix(),
    )
    camera_dict[idx] = {
        "Camera ID": cam.camera_id,
        "Camera Obj": cam,
        "Camera Matrix": projection_matrix,
    }
# Project every 3D point into every camera's image plane.
# p_array_2d_dict[c]["Data"][i] stores, for camera c and point i, the 3D
# column vector and its inhomogeneous 2D projection.
p_array_2d_dict = {}
for camera_index in range(num_cameras):
    p_array_2d_dict[camera_index] = {"Camera ID": camera_index + 1, "Data": {}}
    projection_matrix = camera_dict[camera_index]["Camera Matrix"]
    for point_index in range(num_points):
        point_info = P_array[point_index].reshape((3, 1))  # Change to a column matrix
        P_3d_homogeneous = np.vstack((point_info, [[1]]))  # 3D homogeneous point representation
        p_2d_homogeneous = projection_matrix @ P_3d_homogeneous
        p_2d_homogeneous_scaled = p_2d_homogeneous / p_2d_homogeneous[-1]
        # A check for lesser values of scaling factor
        if abs(p_2d_homogeneous[-1]) < 0.01:
            print(f"Less value of scaling factor:{p_2d_homogeneous} ")
            print(f"3D Point: {P_3d_homogeneous}")
        p_array_2d_dict[camera_index]["Data"][point_index] = {
            "3D": point_info,
            "2D": p_2d_homogeneous_scaled[0:-1],
        }
# For each camera: print its ground-frame pose (signs flipped back from the
# inverted extrinsics) and render the projected 2D image of the point cloud.
for camera_index in range(num_cameras):
    cam_obj = camera_dict[camera_index]["Camera Obj"]
    cam_id = p_array_2d_dict[camera_index]["Camera ID"]
    data = p_array_2d_dict[camera_index]["Data"]
    x_pos_arr = [entry["2D"][0][0] for entry in data.values()]
    y_pos_arr = [entry["2D"][1][0] for entry in data.values()]
    print(f"Parameters of Camera {cam_id}: ")
    print(f"Rotation along X: {(-cam_obj.rot_x) * (180 / pi)} degrees")
    print(f"Rotation along Y: {(-cam_obj.rot_y) * (180 / pi)} degrees")
    print(f"Rotation along Z: {(-cam_obj.rot_z) * (180 / pi)} degrees")
    print(f"Translation along X: {-cam_obj.translate_x}")
    print(f"Translation along Y: {-cam_obj.translate_y}")
    print(f"Translation along Z: {-cam_obj.translate_z}")
    plt.figure(dpi=800)
    plt.plot(x_pos_arr, y_pos_arr, ".", markersize=1.6)
    plt.xlabel("X-coordinate")
    plt.ylabel("Y-coordinate")
    plt.title(f"2D Image from Camera {cam_id}")
    plt.axis("off")
    plt.show()
Parameters of Camera 1: Rotation along X: 0.0 degrees Rotation along Y: 0.0 degrees Rotation along Z: 0.0 degrees Translation along X: 0 Translation along Y: 0 Translation along Z: 10000.502197265625
Parameters of Camera 2: Rotation along X: 0.0 degrees Rotation along Y: 45.0 degrees Rotation along Z: 0.0 degrees Translation along X: 7071.422918957492 Translation along Y: 0 Translation along Z: 7071.422918957492
Parameters of Camera 3: Rotation along X: 0.0 degrees Rotation along Y: 135.0 degrees Rotation along Z: 0.0 degrees Translation along X: 7071.422918957492 Translation along Y: 0 Translation along Z: -7071.422918957492
Parameters of Camera 4: Rotation along X: 0.0 degrees Rotation along Y: 180.0 degrees Rotation along Z: 0.0 degrees Translation along X: 0 Translation along Y: 0 Translation along Z: -10000.502197265625
Parameters of Camera 5: Rotation along X: 0.0 degrees Rotation along Y: 225.0 degrees Rotation along Z: 0.0 degrees Translation along X: -7071.422918957492 Translation along Y: 0 Translation along Z: -7071.422918957492
Parameters of Camera 6: Rotation along X: 0.0 degrees Rotation along Y: 315.0 degrees Rotation along Z: 0.0 degrees Translation along X: -7071.422918957492 Translation along Y: 0 Translation along Z: 7071.422918957492
As the name suggests, line of sight is based on the principle of a singular ray that traces and connects the object in observation plane or space to a corresponding lower dimensional point in the image plane
Here, we trace one ray for each of $N=2$ cameras from a point $P$ in the global 3D space that transform to points $p$ and $p'$ in the two 2D image planes.
Known parameters:
Unknown parameters:


Here, we describe a simple linear triangulation method that solves the lack of an intersection point between rays. We are given two points in the images that correspond to each other. We will go through this with the help of an example where we want to reconstruct a 3D point using $N=2$ cameras and thus, $N=2$ 2D images.
These points are, $$\begin{equation} p = MP = (x,y,1)\\ \text{ and }\\ p' = M'P = (x', y', 1) \end{equation}$$
Next, we use a neat identity that applies to direction vectors: the cross product of a vector with itself is the null vector. Mathematically, this implies, for a given direction vector, $p$,
$$\begin{equation} p \times p = 0 \end{equation}$$Before we proceed any further, let us get a hold of the dimensions in each of these equations:
Through this null cross product of a vector with itself, the direct linear transformation lets us form a system of linear equations from known variables to find the coordinates of the point in 3D space.
Thus, by definition and identity, $$ \begin{equation} p \times MP = 0\\ \\ \implies \begin{bmatrix} i_d & j_d & k_d\\ x & y & 1\\ M_1P & M_2P & M_3P\\ \end{bmatrix}=0 \end{equation} $$
Here, $M_i$ refers to the $i^{th}$ row of the camera matrix, $M$, and $i_d$, $j_d$ and $k_d$ are the unit vectors in the x, y and z directions respectively in the global 3D space.
We will expand this in the next sub-slide.
From this cross product, we obtain three equalities to form three constraints that follow:
$$ \begin{equation} x(M_3P)-M_1P=0\\ y(M_3P)-M_2P=0\\ x(M_2P)-y(M_1P)=0\\ \end{equation} $$Neat right?
Now let us proceed to do this for another camera, where the camera matrix is defined as $M'$ and the 2D projected point of the 3D point $P$, on this camera's 2D image is $p'$. So if we follow the same procedure, $$\begin{equation} p' \times M'P = 0\\ \\ \implies \begin{bmatrix} i & j & k\\ x' & y' & 1\\ M_1'P & M_2'P & M_3'P\\ \end{bmatrix} = 0 \end{equation} $$ And obtain another system of linear equations, $$ \begin{equation} x'(M_3'P)-M_1'P=0\\ y'(M_3'P)-M_2'P=0\\ x'(M_2'P)-y'(M_1'P)=0\\ \end{equation} $$
Once again, this example is built on $N=2$ cameras, but this framework can be extended to any arbitrary $N$ number of cameras. Coming back to our example, we have 3 linear equations per camera. However, only 2 of these 3 equations can be used as a constraint equation for each camera.
Which two equations? We only choose those equations with at least one free variable.
Next, we construct a constraint matrix, $A$ in which every pair of rows are the constraints obtained from a particular camera. For this example,
$$ \begin{equation} A= \begin{bmatrix} x(M_3)-M_1\\ y(M_3)-M_2\\ x'(M_3')-M_1'\\ y'(M_3')-M_2'\\ \end{bmatrix} \end{equation} $$Notice, that $P$ is not present in this representation. This is because, we will form a matrix equation to find P through A. This is mathematically written as,
$$ \begin{equation} AP=0 \end{equation} $$Since, we are conducting this analysis through homogeneous image point representations, this method is thus called Homogeneous Direct Linear Transformation.
Thus, we realize that $P$ is the null space of $A$, and we can easily find $P$ using any matrix operations and algorithms, for example Singular Value Decomposition (SVD).
A key point to remember is that each camera contributes 2 constraints to the matrix $A$. Thus, for $N$ cameras, dimensions of $A$ will be $2N\times4$
Now, if we consider an SVD implementation, we implement the following steps:
Now, let us demonstrate the action of linear reconstruction on a dataset of 2D images simulated by the $N=6$ cameras that we have oriented. Here, we assume the transformations take place in a noiseless environment.
# Reload the simulated 2D projections and camera models saved earlier.
with open('PKL/saved_points_dictionary_skeleton.pkl', 'rb') as points_file:
    p_array_2d_dict = pickle.load(points_file)
with open('PKL/saved_camera_dictionary_skeleton.pkl', 'rb') as cameras_file:
    camera_dict = pickle.load(cameras_file)
num_cameras = 6
num_points = 100000
def get_camera_partition_for_2D_points(p_array_2d_dict):
    """Map each 3D point index to the cameras that observed it.

    Only points seen by more than one camera are kept, since triangulation
    needs at least two views.

    Args:
        p_array_2d_dict: dict keyed by camera index; each value carries a
            "Data" dict keyed by point index.

    Returns:
        dict mapping point_index -> sorted list of camera indices observing it.
    """
    # Derive the point and camera sets from the data itself instead of the
    # module-level num_points/num_cameras globals (same result for this file,
    # but self-contained and reusable on any observation dictionary).
    observed_points = set()
    for cam_entry in p_array_2d_dict.values():
        observed_points.update(cam_entry["Data"].keys())
    point_set_partition_dict = {}
    for point_index in sorted(observed_points):
        camera_idx_array = sorted(
            cam_index
            for cam_index, cam_entry in p_array_2d_dict.items()
            if point_index in cam_entry["Data"]
        )
        if len(camera_idx_array) > 1:
            point_set_partition_dict[point_index] = camera_idx_array
    return point_set_partition_dict
point_set_partition_dict = get_camera_partition_for_2D_points(p_array_2d_dict)
# Linearly reconstruct every multiply-observed 3D point from its 2D views,
# accumulating the squared error against the stored ground-truth 3D point.
P_3D_array = []
error = []
for point_index, camera_idx_list in point_set_partition_dict.items():
    # Gather the camera matrices and the matching 2D observations.
    cam_mat_list = [camera_dict[c]['Camera Matrix'] for c in camera_idx_list]
    point_perspective_array_list = [
        p_array_2d_dict[c]["Data"][point_index]["2D"] for c in camera_idx_list
    ]
    P_3D_temp_dyn = linear_reconstruct_scp_dyn(point_perspective_array_list, cam_mat_list)
    # Normalize the homogeneous solution, then drop the (now unit) scale row.
    P_3D_temp_dyn = P_3D_temp_dyn / P_3D_temp_dyn[3]
    P_3D_temp_dyn = np.delete(P_3D_temp_dyn, 3, 0)
    # Ground truth is the same 3D point under every observing camera, so the
    # last camera in the list is as good as any.
    ground_truth = p_array_2d_dict[camera_idx_list[-1]]["Data"][point_index]["3D"]
    error.append(np.linalg.norm(np.subtract(ground_truth, P_3D_temp_dyn)) ** 2)
    P_3D_array.append(P_3D_temp_dyn.flatten())
visualize_polygon_from_arr_using_plt(P_3D_array)
# Collect the reconstructed xyz coordinates into one array and render the
# resulting point cloud with Open3D.
reconstructed_point_cloud_array = np.array([point[:3] for point in P_3D_array])
pcd = o3d.geometry.PointCloud()
pcd.points = o3d.utility.Vector3dVector(reconstructed_point_cloud_array)
o3d.visualization.draw_geometries([pcd])
We will now look at a more realistic simulation where we have added Gaussian noise to our points in the 2D projected image, of the form $N_G\sim\mathcal{N}(0,7.5)$. A high standard deviation accounts for significant noise in a 2D image with large dimensions.
The main limitation of adding random noise to the image or camera matrices is that there will be no exact solution to the previously established matrix equations. Thus, a large number of runs is required to characterize the distribution, since each noise realization induced in the 2D image matrices results in a different solution.
# Load the noisy linear reconstruction and display it with matplotlib
# and as an Open3D point cloud.
with open('PKL/saved_points_reconstructed_noisy_linear.pkl', 'rb') as noisy_file:
    recon = pickle.load(noisy_file)
visualize_polygon_from_arr_using_plt(recon)
pcd = o3d.geometry.PointCloud()
pcd.points = o3d.utility.Vector3dVector(recon)
o3d.visualization.draw_geometries([pcd])
Here, we will see the action of a nonlinear optimization problem formulation that helps overcome the limitations of Direct Linear Transformation.
As soon as we added noise (which is $n_G\in N_{G}\sim\mathcal{N}(0,7.5)$) to the 2D images, it can be written,
$$ \begin{equation} % p_{err} = p+n_G = MP+n_G = (x_{err},y_{err},1)\\ \hat{p} = p+n_G = MP+n_G = (x,y,1)+n_G = (\hat{x},\hat{y},1) \end{equation} $$where:
Therefore, in this case, $$ \begin{equation} \hat{p} \neq MP \end{equation} $$
Further, from DLT, we obtain some erroneous 3D point, $\hat{P}$, from the reprojection of $\hat{p}$ with $M$,
$$ \begin{equation} \hat{p} = M\hat{P} \end{equation} $$Now, we must reduce the distance between our erroneous points in the 2D image plane, which formulates itself as,
$$ \begin{equation} \begin{array}{rrclcl} \displaystyle \min_{\hat{P}} & \|M\hat{P}-p\|^2\\ % \textrm{s.t.} \\ % &\xi\geq0 \\ \end{array} \end{equation} $$Here, the erroneous 3D point $\hat{P}$ is interpreted as the guess or estimate of the minimization at a given iteration. We seek to find a $\hat{P}$ in 3D that best approximates $P$ by finding the best least-squares estimate of the reprojection error of $\hat{P}$
Now for $N$ cameras and hence, $N$ 2D images, we can rewrite this problem as the aggregation of each camera's error for that particular point $P_i$,
$$ \begin{equation} \begin{array}{rrclcl} \displaystyle \min_{\hat{P}} & \|M\hat{P_1}-p_1\|^2+\|M\hat{P_2}-p_2\|^2+\cdots+\|M\hat{P_N}-p_N\|^2\\ \end{array}\\ \end{equation} $$which can be further simplified as,
$$ \begin{equation} \begin{array}{rrclcl} \displaystyle \min_{\hat{P}} & \sum_{i=1}^{N} \|M\hat{P_i}-p_i\|^2\\ \end{array} \end{equation} $$In order to solve the nonlinear minimization problem, we have to represent the minimization objective through a specific residual function, $r(x)$, to avail the Gauss-Newton algorithm for nonlinear least squares. The general nonlinear least squares problem is to find an $x\in \mathbb{R}^n$ that minimizes, $$ \begin{equation} \|r(x)\|^2=\sum_{i=1}^{m} r_i(x)^2 \end{equation} $$ Here, $r:\mathbb{R}^n\rightarrow\mathbb{R}^m$ such that $r(x)=f(x)-y$ for some input $x$ to a function $f$ with expected output $y$
Let us adopt this framework for our problem. For that, define our reprojection error vector as, $e^i$ for the $i^{th}$ camera, where $e_i=M\hat{P}_i-p_i$, is a $2\times1$ vector (since here we are taking the inhomogeneous representation of the 2D image). Now, our objective function for the minimization problem becomes,
$$ \begin{equation} \begin{array}{rrclcl} \displaystyle \min_{\hat{P}} & \sum_{i=1}^{N} e_i(\hat{P})^2\\ \end{array}\\ \end{equation} $$where, $e$ is a function of the estimate of the 3D point, $\hat{P}$, with its true value known to be at $P$.
For Gauss Newton method, we need to start with an initial estimate from which the algorithm can improve upon. This initial estimate is found from the solutions of the linear system of equations explained earlier.
Till now, we have a nonlinear least squares problem to minimize. Now, the step $\delta_P$ is chosen such that we can linearly approximate the residual function near the current estimate $\hat{P}$.
Thus, the residual function becomes, $$ \begin{equation} e(\hat{P}+\delta_P)\approx e(\hat{P})+\frac{\partial e}{\partial P}\delta_P \end{equation} $$ After the linear approximation, objective of the minimization problem becomes,
$$ \begin{equation} \begin{array}{rrclcl} \displaystyle \min_{\delta_{P}} & \|\frac{\partial e}{\partial P}\delta_P-(-e(\hat{P}))\|^2\\ \end{array} \end{equation} $$Now, the format of the optimization problem has taken the shape of a linear least squares problem which we can solve easily. For our problem that deals with triangulation with $N$ cameras, the linear least squares solution is, $$ \begin{equation} \delta_P=-(J^TJ)^{-1}J^Te \end{equation} $$
where
$$e= \begin{bmatrix} e_1\\ \vdots\\ e_N\end{bmatrix} = \begin{bmatrix} M_1\hat{P}-p_1\\ \vdots\\ M_N\hat{P}-p_N\end{bmatrix} , \ \ \ \ \ J= \begin{bmatrix} \frac{\partial e_1}{\partial \hat{P_1}} & \frac{\partial e_1}{\partial \hat{P_2}} & \frac{\partial e_1}{\partial \hat{P_3}}\\ \vdots & \vdots & \vdots\\ \frac{\partial e_N}{\partial \hat{P_1}} & \frac{\partial e_N}{\partial \hat{P_2}} & \frac{\partial e_N}{\partial \hat{P_3}} \end{bmatrix} $$Here, let us remember the $e_i$ is a $2\times1$ vector, thus making $e$ a $2N\times1$ vector. Thus, the Jacobian, $J$ is a $2N\times3$ matrix
# Reload projections, cameras, and the linear (DLT) estimates used to seed
# the nonlinear Gauss-Newton refinement.
with open('PKL/saved_dictionary.pkl', 'rb') as p_array_2d_dict_file:
    p_array_2d_dict = pickle.load(p_array_2d_dict_file)
with open('PKL/saved_camera_dictionary.pkl', 'rb') as camera_dict_file:
    camera_dict = pickle.load(camera_dict_file)
with open('PKL/saved_points_reconstructed_noisy_linear.pkl', 'rb') as recon_file:
    linear_estimates = pickle.load(recon_file)
with open('PKL/saved_points_reconstructed_linear.pkl', 'rb') as rec_file:
    noiseless_linear_estimates = pickle.load(rec_file)
M_cam1 = camera_dict[0]["Camera Matrix"]
M_cam2 = camera_dict[1]["Camera Matrix"]
M_cam3 = camera_dict[2]["Camera Matrix"]
M_cam4 = camera_dict[3]["Camera Matrix"]
M2 = np.copy(M_cam2)
M3 = np.copy(M_cam3)
nonlinear_array = []
for i in range(len(p_array_2d_dict[0]["Data"])):
    # Homogeneous 2D observations of point i in cameras 0 and 1.
    p1 = get_homogeneous_representation(True, p_array_2d_dict[0]["Data"][i]).flatten()
    p2 = get_homogeneous_representation(True, p_array_2d_dict[1]["Data"][i]).flatten()
    # Seed the refinement with the linear estimate of the 3D point.
    P_init = get_homogeneous_representation(0, linear_estimates[i].reshape(3,1)).flatten()
    # BUG FIX: p1/p2 are observations from cameras 0 and 1, so the stereo
    # refinement must use those cameras' matrices. The original passed
    # M2/M3 (cameras 1 and 2), mismatching the points and the cameras.
    optimized_P = gauss_newton_stereo(P_init, M_cam1, M_cam2, p1, p2)
    nonlinear_array.append(optimized_P)
# Rescale the homogeneous results in place back to inhomogeneous coordinates.
rescale_to_inhomogeneous(nonlinear_array)
gauss_newton_reconstructed_polygon = np.array(nonlinear_array)
pcd = o3d.geometry.PointCloud()
pcd.points = o3d.utility.Vector3dVector(gauss_newton_reconstructed_polygon)
o3d.visualization.draw_geometries([pcd])